* --- Stata session settings -------------------------------------------------
* Run without --more-- pauses; any log that gets opened is written as plain text.
set more off
set logtype text
set matsize 10000
*capture log close

* Today's date formatted as DDMMYYYY. In the visible code it is only used in
* the name of the (currently disabled) log file.
local mydate : display %tdDNCY date(c(current_date), "DMY")

*cd "X:"

* --- Directory globals ------------------------------------------------------
* output : folder for the log file
* data   : older data folder; in the visible code only disabled lines refer to it
* work   : folder with the cleaned input panel and the estimation samples
global output "O:\returns"
global data   "W:\returns\data_koe"
global work   "W:\returns\estimation_data"

*log using "${output}\returns_DATAPREP_2_`mydate'.log" , replace

**************************************************************************************

* This file performs the CEM matching for
* Aghion, Akcigit, Hyytinen & Toivanen : 
* A year older, a year wiser (and farther from the frontier): invention rents and human capital depreciation

**************************************************************************************


/*  
\
()
{}
[]
=
`i'
 >
 <
 |
 +
*/

* Fix the seed of the random-number generator.
* NOTE(review): presumably meant for the random pruning done by the k2k option
* of cem further down; confirm that cem actually draws from this generator.
set seed 346352014

*use "${data}\fleed_returnspanel_iq.dta", clear

* Load the cleaned panel produced by the previous data-preparation step.
* The clear option drops whatever is currently in memory; without it the
* use command stops with error r(4) if unsaved data are loaded, e.g. when this
* do-file is rerun in the same session.
* The IQ-sample restriction below is disabled, so the whole panel is used.
use "${work}\returns_DATAPREP_1_CLEANED.dta", clear
*drop if p_test_year == .
*save "${data}\fleed_returnspanel_iq.dta", replace


*********************************************************************************************************

* variable generation and desc stat

*********************************************************************************************************


* Inventor status and panel indices
*
* inventor_all:
*   0 = non-inventor
*   1 = inventor, any year
*   2 = inventor, year of invention (inv_1 == 1; = AppYear)
* Rows with inv_1 == 1 get code 2, all other rows copy the inventor flag.
gen inventor_all = cond(inv_1 == 1, 2, inventor)

* There can be several rows per individual and year (one per patent).
* Number the rows within individual-year and keep only the first of each.
bysort shtun_lyh vuosi: gen win_shtun_year_ind = _n
keep if win_shtun_year_ind == 1

* Row counter within individual (1 = first remaining row of the person).
bysort shtun_lyh: gen win_shtun_ind = _n

*********************************************************************************************************

* cem variables

*********************************************************************************************************

* 1. education

* ututku is a string education code: character 1 is read as the level and
* character 2 as the field of education. Both are converted to numbers.
gen educ_level 	= substr(ututku,1,1)
gen educ_field 	= substr(ututku,2,1)
destring educ_level educ_field, replace

* education level codes (educ_level)
* 0 = pre-primary
* 1 = lower basic (primary)
* 2 = upper basic (lower secondary)
* 3 = upper secondary
* 5 = lowest tertiary
* 6 = lower university degree (bachelor level)
* 7 = higher university degree (master level)
* 8 = researcher training (doctoral level)
* 9 = unknown
*
* our classification (edu_lev)
* 1 base       (levels 0, 1, 2, 9 and everything else not listed below)
* 2 secondary  (level 3)
* 3 uni        (levels 5 and 6)
* 4 MSc        (level 7)
* 5 PhD        (level 8)
* NOTE: classification different for parents, as they are an older cohort
* NOTE: starting from 1 also imposes the lowest education on those with missing educ
gen edu_lev 		= 1
replace edu_lev		= 2 if educ_level == 3
replace edu_lev		= 3 if inlist(educ_level, 5, 6)
replace edu_lev		= 4 if educ_level == 7
replace edu_lev		= 5 if educ_level == 8

* msc = 1 when edu_lev is above 3, or when edu_lev is 3 or higher and
* sose_bc == 1; 0 otherwise; missing only if edu_lev is missing.
gen msc				= (edu_lev > 3 | (edu_lev >= 3 & sose_bc == 1)) if edu_lev != .

* education field codes (educ_field)
* 1 humanities and arts
* 2 teacher education
* 3 business and administration, law, social and behavioural sciences
* 4 technology and natural sciences
* 5 transport and telecommunications
* 6 health care
* 7 agriculture and forestry
* 8 other fields

* science dummy: fields 4, 5 and 7; 0 for every other (or missing) field
gen edu_science		= inlist(educ_field, 4, 5, 7)

* 2. industry
*tab toimiala2 if edu_lev != .
/*
2002

A Maatalous riistatalous ja metsatalous 01 - 02
B kalatalous							05
C kaivostoiminta ja louhinta			10 - 14
D teollisuus							15 - 37
E sahko kaasu ja vesihuolto				40 - 41
F rakentaminen							45
G tukku ja vahittaiskauppa, moottoriajoneuvojen
seka henkkoht esineiden ja kotitalousesineiden
korjaus									50 - 52
H majoitus ja ravitsemustoiminta		55
I kuljetus, varastointi ja tietoliikenne 60 - 64
J rahoitustoiminta						65 - 67
K kiinteisto vuokraus ja tutkimuspalvelut
liike-elaman palvelut					70 - 74
L julkinen hallinto ja maanpuol, 
pakollinen sosiaalivakuutus				75
M koulutus								80
N terveydenhuolto ja sosiaalipalv		85
O muut yhteiskunnalliset ja henkilo
kohtaiset palvelut						90-93
P tyonantajakotitaloudet seka kotitalouksien itse
tuottamat tavarat ja palvelut			95 - 97
Q kv jarjestot ja ulkom edustustot		98
X toimiala tuntematon					99

2008

A Maatalous metsatalous ja kalatalous	01 - 03
B kaivostoiminta ja louhinta			05 - 09
C teollisuus							10 - 33
D sahko kaasu lampohuolto, 
jaahdytysliiketoiminta					35
E vesihuolto, viemari ja jatevesihuolto jatehuolto 
ja muu ympariston puhtaanapito			36 - 39
F rakentaminen							41 - 43
G tukku ja vahittaiskauppa, moottoriajoneuvojen
ja moottoripyorien korjaus				45 - 47
H kuljetus, varastointi					49 - 53
I majoitus ja ravitsemustoiminta 		55 - 56
J informaatio ja viestinta				58 - 63
K rahoitus ja vakuutustoiminta			64 - 66
L kiinteistoalan toiminta				68
M ammatillinen tieteellinen ja 
tekninen toiminta						69 - 75
N hallinto ja tukipalvelutoiminta		77 - 82
O julkinen hallinto ja maanpuol, 
pakollinen sosiaalivakuutus				84
P koulutus								85
Q terveys ja sosiaalipalv				86 - 88
R taiteet, viihde ja virkistys			90 - 93
S muu palvelutoiminta					94 - 96
T kotitalouksien toiminta tyonantajana, kotitalouksien
eriyttamaton toiminta tavaroiden ja palvelujen
tuottamiseksi omaan kayttoon			97 - 98
U kv organisaatioiden ja toimielinten
toiminta								99
X toimiala tuntematon					00


  
*/

* Two-character prefix of the string industry code
gen industry2		= substr(toimiala,1,2)
tab industry2 		if edu_lev != .

* Some rows carry a section letter instead of digits. Each letter is replaced
* by a two-digit code; the code used depends on whether the row is from
* before 2001 or from 2001 onwards.
local early_letters	DN F N O P T
local early_digits	15 45 85 90 95 97
local n_early : word count `early_letters'
forvalues k = 1/`n_early' {
	local from : word `k' of `early_letters'
	local to   : word `k' of `early_digits'
	replace industry2	= "`to'" if industry2 == "`from'" & vuosi < 2001
}

local late_letters	DN F N O P R T
local late_digits	35 41 77 84 85 90 97
local n_late : word count `late_letters'
forvalues k = 1/`n_late' {
	local from : word `k' of `late_letters'
	local to   : word `k' of `late_digits'
	replace industry2	= "`to'" if industry2 == "`from'" & vuosi >= 2001
}

destring industry2, replace

* Manufacturing dummy: 0 for every row with a known industry, switched to 1
* when the two-digit code lies in the manufacturing range of the
* classification that applies in that year.
gen manuf			= 0 	if industry2 != .
* up to 1992 - source: Tilastokeskus webpage toimialaluokitus 1988
replace manuf		= 1 	if inrange(industry2, 11, 29) & vuosi <= 1992
* 1993-2000 - source: TOL95 - second revised edition
replace manuf		= 1 	if industry2 >= 15 & industry2 < 40 & vuosi > 1992 & vuosi <= 2000
* 2001-2006 - source: Toimialaluokitus 2002
replace manuf		= 1 	if inrange(industry2, 15, 37) & vuosi > 2000 & vuosi <= 2006
* after 2006 - source: Toimialaluokitus 2008
replace manuf		= 1 	if inrange(industry2, 10, 33) & vuosi > 2006


* 3. firm size
/*
*FIRM SIZE AGGREGATED WITH hk_sykstun
bysort sykstun vuosi: gen sykstun_ind = _n
gen hk_sykstun_temp		= hk_sykstun if sykstun_ind == 1
bysort syrtun vuosi: egen empl = total(hk_sykstun_temp), missing 
*/

* Firm size = number of rows in the data for the firm (syrtun) in that year.
* Rows without a firm identifier get a missing firm size.
bysort syrtun vuosi: gen firm_size = _N if syrtun != ""

* Firm-size quintiles, computed separately for each year 1994-2010.
* Variable firm_size_q_YYYY is filled only on rows of year YYYY.
forvalues yr = 1994/2010 {
	xtile firm_size_q_`yr' = firm_size if vuosi == `yr', nq(5)
}

* SME status: 1 if fewer than 250 rows per firm-year, 0 if 250 or more,
* missing when firm size is missing.
gen sme				= (firm_size < 250) if firm_size != .


* 4. region
* suuralue (major region) codes
* 1 southern finland
* 2 western finland
* 3 eastern finland
* 4 northern finland
* 5 aland island

* numeric copy of the string region code
destring suuralue, gen(region)

* region_large: 2 = eastern or northern Finland (codes 3 and 4),
*               1 = any other non-missing region, missing if region is missing
gen region_large		= cond(inlist(region, 3, 4), 2, 1) if region != .

* 5. age
* Age bands built from ika (presumably age in years - confirm):
*   1 = 30 or younger, 2 = 31-40, 3 = 41-50, 4 = older than 50.
* Rows with missing ika keep a missing age band.
gen age			= .
replace age		= 1 if ika <= 30
replace age		= 2 if ika > 30 & ika <= 40
replace age		= 3 if ika > 40 & ika <= 50
* Stata treats a missing value as larger than any number, so a bare
* "ika > 50" would also place rows with unknown age in the oldest band.
* The explicit missing check keeps them out, in line with the guards used
* for the other matching variables in this file.
replace age		= 4 if ika > 50 & !missing(ika)

* 6. IQ
/*
gen iq			= 1
replace iq		= 2 if kuv_pc > 50 & kuv_pc <= 80
replace iq		= 3 if kuv_pc > 80 & kuv_pc <= 90
replace iq		= 4 if kuv_pc > 90 & kuv_pc <= 100
*/
* 7. wage growth and levels
* dwage = mean of the two most recent lagged wage growth rates of the person:
*   (w[-1] - w[-2]) / w[-2]   and   (w[-2] - w[-3]) / w[-3],
* where w[-k] is the wage k rows earlier for the same person.
* NOTE(review): lags are counted in rows, not calendar years; this equals
* year lags only if each person has one row per consecutive year - confirm.
sort shnro vuosi
by shnro: gen dwage			= (wage[_n - 1] - wage[_n - 2]) / (2 * wage[_n - 2]) + (wage[_n - 2] - wage[_n - 3]) / (2 * wage[_n - 3]) if wage[_n - 2] != . & wage[_n - 3] != .

* When one of the two base wages is zero its growth rate is undefined (the
* sum above is then missing), so fall back on the other growth rate alone.
* These replacements must run under the same by-prefix as the line above:
* without it, the subscripts count rows of the whole dataset and the first
* rows of a person would pick up wages of the preceding person.
by shnro: replace dwage		= (wage[_n - 1] - wage[_n - 2]) / wage[_n - 2] if wage[_n - 3] == 0
by shnro: replace dwage		= (wage[_n - 2] - wage[_n - 3]) / wage[_n - 3] if wage[_n - 2] == 0
tabstat dwage, stat(p10 p25 p50 p75 p90)

* Wage-growth quintiles, computed separately for each year 1994-2010.
forvalues i = 1994/2010 {
	xtile dwage_q_`i'		=  dwage if  vuosi == `i', nq(5)
	}

* Lagged wage levels: wage_lK holds the wage K rows earlier, provided that row
* belongs to the same person (shnro); otherwise it is missing.
sort shnro vuosi
forvalues lag = 1/4 {
	gen wage_l`lag'		= wage[_n - `lag'] if shnro == shnro[_n - `lag']
}

* Mean of the available lagged wages (rowmean skips missing lags)
egen wage_lavg		= rowmean(wage_l1 wage_l2 wage_l3 wage_l4)

* Quintiles of the lagged average wage, computed separately for each year
forvalues yr = 1994/2010 {
	xtile wage_lavgq_`yr'		=  wage_lavg if  vuosi == `yr', nq(5)
}

* Flag (1 / missing) for rows where all four wage lags are observed
gen wage_l14_obs		= 1 if wage_l1 != . & wage_l2 != . & wage_l3 != . & wage_l4 != .

* 8. dtf & suorv_d
* dtf = vuosi - suorv, capped from above at ika - 15 whenever both dtf and
* suorv are observed. Rows with missing suorv keep a missing dtf.
* NOTE(review): suorv is presumably the year the degree was completed - confirm.
gen dtf						= vuosi - suorv
replace dtf					= min(dtf, ika - 15) 	if dtf != . & suorv != .

* suorv_d flags rows where suorv is missing
gen suorv_d				= (suorv == .)

* 9. inspect cem variables
summarize msc edu_science manuf sme region_large dtf
tabulate age 
*tab iq

*tab suuralue inventor_all if win_shtun_ind == 1

*********************************************************************************************************

* firm-level inventor indicators (used to define the coworker treatment groups)

***************************************************************************************
* inv_firm: sum of the inventor flag over all rows of the firm (all years);
* left missing for rows without a firm identifier.
bysort syrtun: 				egen inv_firm		= total(inventor) if syrtun != "", missing

* inv_firm_d: 0 if that sum is zero, 1 if it is positive, missing otherwise
gen inv_firm_d				= cond(inv_firm == 0, 0, cond(inv_firm > 0 & inv_firm != ., 1, .)) if syrtun != ""

gen inventor_cem_sample		= 0

* inv_firm_year: sum of inv_1 (invention in that year) over the rows of the
* firm in the same year; left missing for rows without a firm identifier.
bysort syrtun vuosi: egen inv_firm_year		= total(inv_1) if syrtun != "", missing

* inv_firm_year_d: 0 if that sum is zero, 1 if it is positive, missing otherwise
gen inv_firm_year_d			= cond(inv_firm_year == 0, 0, cond(inv_firm_year > 0 & inv_firm_year != ., 1, .)) if syrtun != ""

* Store the panel with all matching variables attached
save "${work}\returns_DATAPREP_temp_CLEANED.dta", replace

*use "${data}\returns_DATAPREP_temp_iq.dta", clear


*********************************************************************************************************
* inventor cem sample
*********************************************************************************************************
**********************************************
* use firm size quintiles instead of SME
**********************************************
/*
use "${data}\fleed_returnspanel_iq_cem.dta", clear
preserve

forvalues i = 1994/2010 {
	
	restore
	preserve
	
	keep if (inv_firm_d == 0 | (inv_1 == 1 & syrtun != "")) &  wage_l14_obs == 1 & vuosi == `i'
	
	if vuosi != 1994 {
		merge 1:1 shnro using "${data}\cemE.dta"
		drop if cem_matched == 1 & inventor == 0
		keep if _merge == 1
		drop _merge
		}
	
	display "vuosi on `i'"
	cem msc edu_science manuf empl_q_`i' region_large iq age, treatment(inv_1) k2k
	
	keep if cem_matched	== 1
	keep shnro vuosi cem_matched cem_weights
	if vuosi == 1994 {
		save "${data}\cemE.dta", replace
		}
	else {
		append using "${data}\cemE.dta"
		save "${data}\cemEm.dta", replace
		}
	}
restore	

preserve
merge 1:1 shnro vuosi using "${data}\cemE.dta"

* keeping all obs of matched individuals
bysort shnro: egen cem_sample 	= max(cem_matched)

keep if cem_sample == 1 | inventor == 1

save "${data}\cem_estim_inventorsE.dta", replace

restore



*********************************************************************************************************
* add wage growth and levels to CEM vector
*********************************************************************************************************

preserve

forvalues i = 1994/2010 {
	
	restore
	preserve
	
	keep if (inv_firm_d == 0 | (inv_1 == 1 & syrtun != "")) &  wage_l14_obs == 1 & vuosi == `i'
	
	if vuosi != 1994 {
		merge 1:1 shnro using "${data}\cemD.dta"
		drop if cem_matched == 1 & inventor == 0
		keep if _merge == 1
		drop _merge
		}
	
	display "vuosi on `i'"
	cem msc edu_science manuf empl_q_`i' region_large iq age dwage_q_`i' wage_lavgq_`i', treatment(inv_1) k2k
	
	keep if cem_matched	== 1
	keep shnro vuosi cem_matched
	if vuosi == 1994 {
		save "${data}\cemD.dta", replace
		}
	else {
		append using "${data}\cemD.dta"
		save "${data}\cemD.dta", replace
		}
	}
restore	

preserve
merge 1:1 shnro vuosi using "${data}\cemD.dta"

* keeping all obs of matched individuals
bysort shnro: egen cem_sample 	= max(cem_matched)

keep if cem_sample == 1 | inventor == 1

save "${data}\cem_estim_inventorsD.dta", replace

restore


*********************************************************************************************************

* coworker samples

*********************************************************************************************************

gen treated_cow				= .
replace treated_cow			= .
preserve
global sose_types sose_mngt_sr sose_sr sose_mngt_jr sose_jr sose_bc 

foreach j of global sose_types {
	forvalues i = 1994/2010 {
		
		restore
		preserve
		
		replace treated_cow		= .
		replace treated_cow		= 0 if inv_firm_d == 0 & syrtun != "" & `j' == 1
		replace treated_cow		= 1 if inv_firm_year_d == 1 & inventor == 0 & syrtun != "" & `j' == 1 
		
		keep if treated_cow != . &  wage_l14_obs == 1 & vuosi == `i'
		
		if vuosi != 1994 {
			merge 1:1 shnro using "${data}\cem_`j'E.dta"
			drop if cem_matched == 1 & treated_cow == 0
			keep if _merge == 1
			drop _merge
			}
		
		display "vuosi on `i'"
		cem msc edu_science manuf  empl_q_`i' region_large iq age, treatment(treated_cow) k2k
		
		keep if cem_matched	== 1
		keep shnro vuosi cem_matched treated_cow cem_weights
		if vuosi == 1994 {
			save "${data}\cem_`j'E.dta", replace
			}
		else {
			append using "${data}\cem_`j'E.dta"
			save "${data}\cem_`j'E.dta", replace
			}
		}
	}
restore	

*use "${data}\fleed_returnspanel_iq_cem.dta", clear
drop treated_cow
foreach j of global sose_types {
	preserve
	merge 1:1 shnro vuosi using "${data}\cem_`j'E.dta"

	* keeping all obs of matched individuals
	bysort shnro: egen cem_sample 	= max(cem_matched)

	keep if cem_sample == 1 

	save "${data}\cem_estim_`j'E.dta", replace

	restore
	}

*********************************************************************************************************
* exclude IQ from CEM vector
*********************************************************************************************************
*/
*use "W:\returns\data_koe\returns_DATAPREP_temp_CLEANED.dta", clear


* ---------------------------------------------------------------------------
* Coworker CEM matching, run separately for every socio-economic group
* (the sose_ indicator variables listed in the global) and every year 1994-2010.
*
* treated_cow within a group:
*   1 = non-inventor working in a firm-year with at least one invention
*       (inv_firm_year_d == 1)
*   0 = person working in a firm with no inventor rows at all (inv_firm_d == 0)
*
* For each group the matched rows of all years are accumulated in the file
* returns_EST_temp<group>G.dta in the work folder.
* ---------------------------------------------------------------------------
gen treated_cow				= .
*replace treated_cow			= .
preserve
global sose_types sose_mngt_sr sose_sr sose_mngt_jr sose_jr  sose_bc

foreach j of global sose_types {
	forvalues i = 1994/2010 {
		
		* Bring back the full panel and keep it preserved for the next pass
		* (same effect as restore followed by preserve).
		restore, preserve
		
		replace treated_cow		= .
		replace treated_cow		= 0 if inv_firm_d == 0 & syrtun != "" & `j' == 1
		replace treated_cow		= 1 if inv_firm_year_d == 1 & inventor == 0 & syrtun != "" & `j' == 1 
		
		* Candidates of this year: in a treatment or control group and with
		* all four wage lags observed.
		keep if treated_cow != . &  wage_l14_obs == 1 & vuosi == `i'
		
		* The year tests use the loop value rather than the variable vuosi.
		* A programming if on a variable reads its value in the first row,
		* which is missing when no row is left; the 1994 pass would then try to
		* merge with / append to a file that does not exist yet or is left
		* over from an earlier run.
		if `i' != 1994 {
			* Remove everyone matched in an earlier year of this group.
			* The accumulated file holds matched persons only, so keeping
			* master-only rows drops earlier treated and earlier controls.
			merge 1:1 shnro using "${work}\returns_EST_temp`j'G.dta"
			drop if cem_matched == 1 & treated_cow == 0
			keep if _merge == 1
			drop _merge
			}
		
		display "vuosi on `i'"
		* Exact matching on every variable marked (#0); dtf is coarsened at
		* the cutpoints 5, 10, 15 and 20. k2k requests one-to-one matching.
		cem msc (#0) edu_science (#0) manuf (#0) firm_size_q_`i' (#0) region_large (#0) age (#0) dtf (5 10 15 20) suorv_d (#0) , treatment(treated_cow) k2k
		
		* Keep the matched rows and add them to the file of this group
		keep if cem_matched	== 1
		keep shnro vuosi cem_matched treated_cow cem_weights
		if `i' == 1994 {
			* first year: start a fresh file, replacing any older one
			save "${work}\returns_EST_temp`j'G.dta", replace
			}
		else {
			append using "${work}\returns_EST_temp`j'G.dta"
			save "${work}\returns_EST_temp`j'G.dta", replace
			}
		}
	}
restore	

*use "${data}\returns_DATAPREP_temp.dta", clear
* Build one estimation sample per socio-economic group.
* treated_cow is dropped from the panel here; the merge brings it back from
* the file of matched rows.
drop treated_cow
foreach j of global sose_types {
	preserve

	* attach the match results of this group to the full panel
	merge 1:1 shnro vuosi using "${work}\returns_EST_temp`j'G.dta"

	* a person belongs to the sample if matched in any year;
	* all rows (years) of such persons are kept
	by shnro, sort: egen cem_sample 	= max(cem_matched)
	drop if cem_sample != 1

	* store the sample and remove the intermediate file of matched rows
	save "${work}\returns_EST_`j'G.dta", replace
	erase "${work}\returns_EST_temp`j'G.dta"

	restore
	}

**********************************************
/*

NOTE: the CEM command written by Blackwell, Iacus, Porro and King
("cem: Coarsened exact matching in Stata",
Stata Journal 2009, 9(4), 524-546)
does not reproduce the matched data 1:1 when rerun with the k2k option, which we use.
To replicate our results 1:1, email
otto.toivanen@aalto.fi and/or ari.hyytinen@hanken.fi to
request permission to access our estimation data.

*/	
**********************************************

*log close

* Stop the do-file here; nothing after this command is executed.
exit

*******************************************************************************************************************
*******************************************************************************************************************
*******************************************************************************************************************
